QOL Healthcare Utilization (VSURF and Analysis of Deviance)

Author

Miguel Fudolig

Code
library(tidyverse)
library(ggplot2)
library(car)
library(caret)
library(rFerns)
library(VSURF)
library(glmnet)
library(Boruta)
library(doParallel)
library(Hmisc)
library(mlbench)
library(pROC)

Setup

Code
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |> 
  mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
         `English Speaking`=relevel(`English Speaking`,ref="Not at all"),
         Ethnicity = relevel(Ethnicity,ref="Chinese"),
         Religion=relevel(Religion,ref="None")) |> 
  mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
                                         "$10,000 - $19,999" ~"Below",
                                         "$20,000 - $29,999"~"Below",
                                         "$30,000 - $39,999"~"Below",
                                         "$40,000 - $49,999"~"Below",
                                         "$50,000 - $59,999"~"Below",
                                         "$60,000 - $69,999"~"Above",
                                         "$70,000 and over"~"Above",
                                          .default=Income)) |> 
  mutate(Income_median = factor(Income_median, levels=c("Below","Above"))) |> 
  mutate(across(`Familiarity with America`:`Familiarity with Ethnic Origin`,~factor(.x,levels=c("Very low","Low", "High", "Very high"))),
         across(`Identify Ethnically`,~factor(.x,levels=c("Not at all","Not very close","Somewhat close","Very close"))),
         across(`Belonging`,~factor(.x,levels=c("Not at all","Not very much","Somewhat","Very much"))),
         `Primary Language` = as.factor(`Primary Language`))
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
Code
qol |> DT::datatable()
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Standardizing Age, Duration of Residency, and

Code
qol <- qol |> mutate(across(c(Age,`Duration of Residency`,`Education Completed`),~scale(.x)))

Unmet Health Need

VSURF Approach

Important

We will be using the Prediction Set to run an analysis of deviance to check model performance.

Code
rfdata <- qol |> select(`Unmet Health Need`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Unmet.Health.Need)
Characteristic N = 1,7211
Unmet.Health.Need
    0 1,548 (90%)
    Yes 173 (10%)
1 n (%)
Code
n_minority <- rfdata |> filter(Unmet.Health.Need=="Yes")|> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)
set.seed(123321)
VSURF(Unmet.Health.Need~.,
      rfdata,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Unmet.Health.Need ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 19.2 secs 

 VSURF selected: 
    27 variables at thresholding step (in 9.1 secs)
    1 variables at interpretation step (in 9.5 secs)
    1 variables at prediction step (in 0.6 secs)

 VSURF ran in parallel on a PSOCK cluster and used 7 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "English.Speaking"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "English.Speaking"
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.09904126

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                English.Speaking    0.01433       0.00089 black
2                             Age    0.01065       0.00060 black
3           Duration.of.Residency    0.01004       0.00040 black
4                       Ethnicity    0.00785       0.00062   red
5                  Discrimination    0.00657       0.00025 black
6                Dental.Insurance    0.00616       0.00045 black
7                        Religion    0.00574       0.00058 black
8            English.Difficulties    0.00492       0.00044 black
9                  Marital.Status    0.00479       0.00038 black
10               Health.Insurance    0.00338       0.00047 black
11           Full.Time.Employment    0.00283       0.00025 black
12                  Income_median    0.00228       0.00038 black
13            Identify.Ethnically    0.00227       0.00028 black
14                        US.Born    0.00209       0.00018 black
15       Familiarity.with.America    0.00191       0.00032 black
16               Primary.Language    0.00182       0.00020 black
17                      Belonging    0.00179       0.00029 black
18                        Student    0.00170       0.00019 black
19 Familiarity.with.Ethnic.Origin    0.00068       0.00030 black
20                        Retired    0.00066       0.00013 black
21                         Gender    0.00054       0.00019 black
22                      Homemaker    0.00013       0.00014 black
23           Part.Time.Employment    0.00011       0.00017 black
24        Self.Employed.Full.Time    0.00000       0.00000 black
25        Self.Employed.Part.Time    0.00000       0.00000 black
26                       Disabled    0.00000       0.00000 black
27                     Unemployed    0.00000       0.00000 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_unmethealth.png", width=6, height=4.5,units="in")

Analysis of Deviance Using the Prediction Set

Code
all_formula <- Unmet.Health.Need~.
pred_vars <- names(rfdata[,-1])[vsurf.mod$varselect.pred]


pred_vars <- if("Ethnicity" %in% pred_vars){
  pred_vars
} else {
  c(pred_vars,"Ethnicity")
  }
mod_form <- reformulate(pred_vars, response="Unmet.Health.Need")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Health.Need")

options(contrasts = c("contr.sum","contr.poly"))
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Unmet.Health.Need ~ English.Speaking
Model 2: Unmet.Health.Need ~ English.Speaking + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
1      1717     1058.6                        
2      1712     1042.0  5   16.556 0.005423 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1109.088        1088.391
Code
car::Anova(mod1, test.statistic = "F")
Analysis of Deviance Table (Type II tests)

Response: Unmet.Health.Need
Error estimate based on Pearson residuals 

                  Sum Sq   Df F value    Pr(>F)    
English.Speaking   45.32    3 14.8808 1.449e-09 ***
Ethnicity          16.56    5  3.2615   0.00618 ** 
Residuals        1738.14 1712                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -2.03556    0.12173 -16.722  < 2e-16 ***
English.Speaking1  1.01133    0.20902   4.839 1.31e-06 ***
English.Speaking2  0.32016    0.13649   2.346    0.019 *  
English.Speaking3 -0.84052    0.16431  -5.116 3.13e-07 ***
Ethnicity1        -0.21044    0.18711  -1.125    0.261    
Ethnicity2        -0.34833    0.22688  -1.535    0.125    
Ethnicity3         0.01486    0.28599   0.052    0.959    
Ethnicity4         0.29352    0.18301   1.604    0.109    
Ethnicity5        -0.29265    0.39474  -0.741    0.458    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1122.9  on 1720  degrees of freedom
Residual deviance: 1042.0  on 1712  degrees of freedom
AIC: 1060

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.0957 0.0148 Inf    0.0704     0.129
 Asian Indian 0.0844 0.0192 Inf    0.0536     0.130
 Filipino     0.1170 0.0339 Inf    0.0652     0.201
 Korean       0.1491 0.0227 Inf    0.1098     0.199
 Other        0.0888 0.0379 Inf    0.0375     0.196
 Vietnamese   0.1835 0.0242 Inf    0.1407     0.236

Results are averaged over the levels of: English.Speaking 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff") |> summary(infer=T)
 contrast            estimate    SE  df asymp.LCL asymp.UCL z.ratio p.value
 Chinese effect       -0.2104 0.187 Inf   -0.7041     0.283  -1.125  0.3911
 Asian Indian effect  -0.3483 0.227 Inf   -0.9469     0.250  -1.535  0.2494
 Filipino effect       0.0149 0.286 Inf   -0.7397     0.769   0.052  0.9586
 Korean effect         0.2935 0.183 Inf   -0.1893     0.776   1.604  0.2494
 Other effect         -0.2927 0.395 Inf   -1.3341     0.749  -0.741  0.5502
 Vietnamese effect     0.5431 0.172 Inf    0.0901     0.996   3.163  0.0094

Results are averaged over the levels of: English.Speaking 
Results are given on the log odds ratio (not the response) scale. 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
P value adjustment: fdr method for 6 tests 

Unmet Dental Need

VSURF Approach

Code
rfdata <- qol |> select(`Unmet Dental Needs`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Unmet.Dental.Needs)
Characteristic N = 1,7171
Unmet.Dental.Needs
    0 1,528 (89%)
    Yes 189 (11%)
1 n (%)
Code
n_minority <- rfdata |> filter(Unmet.Dental.Needs=="Yes")|> nrow()
set.seed(123321)
rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

VSURF(Unmet.Dental.Needs~.,
      rfdata,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Unmet.Dental.Needs ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 22.9 secs 

 VSURF selected: 
    26 variables at thresholding step (in 11.6 secs)
    1 variables at interpretation step (in 10.7 secs)
    1 variables at prediction step (in 0.6 secs)

 VSURF ran in parallel on a PSOCK cluster and used 7 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Dental.Insurance"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Dental.Insurance"
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.1097263

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                Dental.Insurance    0.01403       0.00057 black
2                             Age    0.01082       0.00070 black
3                English.Speaking    0.01023       0.00057 black
4                        Religion    0.00785       0.00056 black
5           Duration.of.Residency    0.00687       0.00052 black
6                  Marital.Status    0.00548       0.00027 black
7                   Income_median    0.00543       0.00036 black
8                       Ethnicity    0.00411       0.00038   red
9        Familiarity.with.America    0.00385       0.00035 black
10           English.Difficulties    0.00249       0.00038 black
11               Primary.Language    0.00240       0.00031 black
12           Full.Time.Employment    0.00202       0.00030 black
13               Health.Insurance    0.00170       0.00044 black
14                        US.Born    0.00170       0.00025 black
15                        Retired    0.00152       0.00024 black
16                 Discrimination    0.00125       0.00028 black
17                        Student    0.00110       0.00021 black
18 Familiarity.with.Ethnic.Origin    0.00095       0.00033 black
19                      Belonging    0.00091       0.00037 black
20            Identify.Ethnically    0.00066       0.00034 black
21                      Homemaker    0.00032       0.00013 black
22                         Gender    0.00025       0.00022 black
23        Self.Employed.Full.Time    0.00000       0.00000 black
24        Self.Employed.Part.Time    0.00000       0.00000 black
25                       Disabled    0.00000       0.00000 black
26                     Unemployed    0.00000       0.00000 black
27           Part.Time.Employment   -0.00034       0.00015 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_unmetdental.png", width=6, height=4.5,units="in")
Note

Dental Insurance is the only variable in the Prediction Set, which means Ethnicity might not be a significant predictor of unmet dental needs.

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.pred],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Unmet.Dental.Needs")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Unmet.Dental.Needs")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Unmet.Dental.Needs ~ Dental.Insurance
Model 2: Unmet.Dental.Needs ~ Dental.Insurance + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)
1      1715     1097.7                     
2      1710     1091.7  5    5.946   0.3115
Code
performance::check_model(mod1)

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1143.842        1112.546
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -2.14863    0.10164 -21.139   <2e-16 ***
Dental.Insurance1  0.74942    0.08515   8.801   <2e-16 ***
Ethnicity1        -0.09192    0.16793  -0.547    0.584    
Ethnicity2        -0.17522    0.18386  -0.953    0.341    
Ethnicity3         0.03226    0.25564   0.126    0.900    
Ethnicity4         0.22752    0.16628   1.368    0.171    
Ethnicity5        -0.27426    0.34240  -0.801    0.423    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1190.5  on 1716  degrees of freedom
Residual deviance: 1091.7  on 1710  degrees of freedom
AIC: 1105.7

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.0962 0.0146 Inf    0.0712     0.129
 Asian Indian 0.0892 0.0154 Inf    0.0632     0.124
 Filipino     0.1075 0.0274 Inf    0.0644     0.174
 Korean       0.1277 0.0185 Inf    0.0957     0.169
 Other        0.0814 0.0300 Inf    0.0388     0.163
 Vietnamese   0.1339 0.0196 Inf    0.0998     0.177

Results are averaged over the levels of: Dental.Insurance 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")|> summary(infer=T)
 contrast            estimate    SE  df asymp.LCL asymp.UCL z.ratio p.value
 Chinese effect       -0.0919 0.168 Inf    -0.535     0.351  -0.547  0.7010
 Asian Indian effect  -0.1752 0.184 Inf    -0.660     0.310  -0.953  0.6347
 Filipino effect       0.0323 0.256 Inf    -0.642     0.707   0.126  0.8996
 Korean effect         0.2275 0.166 Inf    -0.211     0.666   1.368  0.5136
 Other effect         -0.2743 0.342 Inf    -1.178     0.629  -0.801  0.6347
 Vietnamese effect     0.2816 0.169 Inf    -0.165     0.729   1.662  0.5136

Results are averaged over the levels of: Dental.Insurance 
Results are given on the log odds ratio (not the response) scale. 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
P value adjustment: fdr method for 6 tests 
Note

The non-significant analysis of deviance implies that Ethnicity might not be a significant predictor of unmet dental needs.

Physical Check-up

VSURF

Code
rfdata <- qol |> 
  select(`Physical Check-up`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Physical.Check.up)
Characteristic N = 1,7141
Physical.Check.up
    0 524 (31%)
    Yes 1,190 (69%)
1 n (%)
Code
n_minority <- rfdata |> filter(Physical.Check.up=="0")|> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

set.seed(123321)
set.seed(123321)
VSURF(Physical.Check.up~.,
      rfdata,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Physical.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 39.5 secs 

 VSURF selected: 
    27 variables at thresholding step (in 20.8 secs)
    6 variables at interpretation step (in 14.6 secs)
    4 variables at prediction step (in 4.1 secs)

 VSURF ran in parallel on a PSOCK cluster and used 7 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Duration.of.Residency" "Health.Insurance"      "Dental.Insurance"     
[4] "Income_median"        
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Duration.of.Residency" "Age"                   "Health.Insurance"     
[4] "Dental.Insurance"      "Income_median"         "Student"              
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.2763711
Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.03324       0.00087 black
2                             Age    0.02493       0.00065 black
3                Health.Insurance    0.01433       0.00061 black
4                Dental.Insurance    0.01274       0.00067 black
5                   Income_median    0.00655       0.00042 black
6                         Student    0.00646       0.00062 black
7                       Ethnicity    0.00614       0.00062   red
8                    EnglishSpeak    0.00610       0.00059 black
9                  Marital.Status    0.00593       0.00034 black
10                       Religion    0.00569       0.00082 black
11                    EnglishDiff    0.00477       0.00047 black
12                        Retired    0.00344       0.00034 black
13                     Employment    0.00344       0.00042 black
14       Familiarity.with.America    0.00254       0.00037 black
15                         Gender    0.00225       0.00048 black
16 Familiarity.with.Ethnic.Origin    0.00223       0.00053 black
17                 Discrimination    0.00117       0.00038 black
18               Primary.Language    0.00115       0.00032 black
19                        US.Born    0.00086       0.00018 black
20            Identify.Ethnically    0.00077       0.00042 black
21           Part.Time.Employment    0.00058       0.00025 black
22                      Homemaker    0.00024       0.00016 black
23                      Belonging    0.00008       0.00030 black
24        Self.Employed.Full.Time    0.00000       0.00000 black
25        Self.Employed.Part.Time    0.00000       0.00000 black
26                       Disabled    0.00000       0.00000 black
27                     Unemployed    0.00000       0.00000 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_PC.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.pred],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Physical.Check.up")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Physical.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Physical.Check.up ~ Duration.of.Residency + Health.Insurance + 
    Dental.Insurance + Income_median
Model 2: Physical.Check.up ~ Duration.of.Residency + Health.Insurance + 
    Dental.Insurance + Income_median + Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1709     1855.7                          
2      1704     1826.1  5   29.563 1.797e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)
Some of the variables were in matrix-format - probably you used
  `scale()` on your data?
  If so, and you get an error, please try `datawizard::standardize()` to
  standardize your data.
Some of the variables were in matrix-format - probably you used
  `scale()` on your data?
  If so, and you get an error, please try `datawizard::standardize()` to
  standardize your data.

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1900.562        1892.892
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)            0.52527    0.09241   5.684 1.32e-08 ***
Duration.of.Residency  0.66004    0.07135   9.251  < 2e-16 ***
Health.Insurance1     -0.49501    0.08975  -5.515 3.48e-08 ***
Dental.Insurance1     -0.21240    0.06805  -3.121 0.001801 ** 
Income_median1        -0.22176    0.06542  -3.390 0.000699 ***
Ethnicity1             0.02811    0.11660   0.241 0.809513    
Ethnicity2             0.06514    0.12321   0.529 0.596980    
Ethnicity3             0.50145    0.18332   2.735 0.006230 ** 
Ethnicity4            -0.44553    0.12263  -3.633 0.000280 ***
Ethnicity5            -0.50148    0.20344  -2.465 0.013700 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2110.4  on 1713  degrees of freedom
Residual deviance: 1826.1  on 1704  degrees of freedom
AIC: 1846.1

Number of Fisher Scoring iterations: 4
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity     prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.633 0.0309 Inf     0.571     0.691
 Asian Indian 0.642 0.0325 Inf     0.576     0.702
 Filipino     0.735 0.0423 Inf     0.644     0.809
 Korean       0.518 0.0350 Inf     0.449     0.586
 Other        0.504 0.0607 Inf     0.387     0.620
 Vietnamese   0.705 0.0333 Inf     0.635     0.765

Results are averaged over the levels of: Health.Insurance, Dental.Insurance, Income_median 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff")|> summary(infer=T)
 contrast            estimate    SE  df asymp.LCL asymp.UCL z.ratio p.value
 Chinese effect        0.0281 0.117 Inf   -0.2795    0.3357   0.241  0.8095
 Asian Indian effect   0.0651 0.123 Inf   -0.2599    0.3902   0.529  0.7164
 Filipino effect       0.5015 0.183 Inf    0.0178    0.9851   2.735  0.0187
 Korean effect        -0.4455 0.123 Inf   -0.7691   -0.1220  -3.633  0.0017
 Other effect         -0.5015 0.203 Inf   -1.0382    0.0352  -2.465  0.0206
 Vietnamese effect     0.3523 0.138 Inf   -0.0114    0.7160   2.555  0.0206

Results are averaged over the levels of: Health.Insurance, Dental.Insurance, Income_median 
Results are given on the log odds ratio (not the response) scale. 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
P value adjustment: fdr method for 6 tests 

Dental Check-up

VSURF

Code
rfdata <- qol |> 
  select(`Dentist Check-up`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
  na.omit() |> 
  rename(Employment=`Full Time Employment`,
         EnglishSpeak=`English Speaking`,
         EnglishDiff=`English Difficulties`) |> 
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Dentist.Check.up)
Characteristic N = 1,7091
Dentist.Check.up
    0 689 (40%)
    Yes 1,020 (60%)
1 n (%)
Code
n_minority <- rfdata |> filter(Dentist.Check.up=="0") |> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

set.seed(123321)
VSURF(Dentist.Check.up~.,
      rfdata,
      na.action="na.omit",
      parallel=T,verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Dentist.Check.up ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 42.3 secs 

 VSURF selected: 
    27 variables at thresholding step (in 22.1 secs)
    3 variables at interpretation step (in 17.9 secs)
    3 variables at prediction step (in 2.3 secs)

 VSURF ran in parallel on a PSOCK cluster and used 7 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Duration.of.Residency" "Dental.Insurance"      "Religion"             
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Duration.of.Residency" "Dental.Insurance"      "Religion"             
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.2855471
Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1           Duration.of.Residency    0.05253       0.00115 black
2                Dental.Insurance    0.04744       0.00074 black
3                        Religion    0.01716       0.00079 black
4                             Age    0.01390       0.00068 black
5                       Ethnicity    0.01241       0.00062   red
6                Health.Insurance    0.00908       0.00038 black
7                    EnglishSpeak    0.00735       0.00072 black
8                   Income_median    0.00709       0.00038 black
9             Identify.Ethnically    0.00359       0.00040 black
10                    EnglishDiff    0.00348       0.00045 black
11                         Gender    0.00317       0.00044 black
12                 Marital.Status    0.00296       0.00039 black
13                      Belonging    0.00222       0.00050 black
14       Familiarity.with.America    0.00219       0.00038 black
15 Familiarity.with.Ethnic.Origin    0.00211       0.00038 black
16                        Student    0.00207       0.00027 black
17               Primary.Language    0.00173       0.00034 black
18                     Employment    0.00145       0.00031 black
19                 Discrimination    0.00074       0.00027 black
20                        US.Born    0.00070       0.00021 black
21                      Homemaker    0.00014       0.00022 black
22           Part.Time.Employment    0.00008       0.00017 black
23                        Retired    0.00005       0.00017 black
24        Self.Employed.Full.Time    0.00000       0.00000 black
25        Self.Employed.Part.Time    0.00000       0.00000 black
26                       Disabled    0.00000       0.00000 black
27                     Unemployed    0.00000       0.00000 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_DC.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.pred],"Ethnicity")
mod_form <- reformulate(pred_vars, response="Dentist.Check.up")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Dentist.Check.up")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Dentist.Check.up ~ Duration.of.Residency + Dental.Insurance + 
    Religion
Model 2: Dentist.Check.up ~ Duration.of.Residency + Dental.Insurance + 
    Religion + Ethnicity
  Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
1      1700     1897.6                       
2      1695     1886.3  5   11.237  0.04688 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)
Some of the variables were in matrix-format - probably you used
  `scale()` on your data?
  If so, and you get an error, please try `datawizard::standardize()` to
  standardize your data.
Some of the variables were in matrix-format - probably you used
  `scale()` on your data?
  If so, and you get an error, please try `datawizard::standardize()` to
  standardize your data.

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1      1990.564        1964.582
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)            0.241988   0.101446   2.385   0.0171 *  
Duration.of.Residency  0.663596   0.067581   9.819   <2e-16 ***
Dental.Insurance1     -0.794684   0.059367 -13.386   <2e-16 ***
Religion1              0.031546   0.178166   0.177   0.8595    
Religion2              0.375181   0.194618   1.928   0.0539 .  
Religion3              0.185876   0.179582   1.035   0.3006    
Religion4             -0.275420   0.257288  -1.070   0.2844    
Religion5             -0.302753   0.332627  -0.910   0.3627    
Religion6             -0.006214   0.391627  -0.016   0.9873    
Ethnicity1             0.358734   0.145966   2.458   0.0140 *  
Ethnicity2            -0.173972   0.260818  -0.667   0.5048    
Ethnicity3             0.257446   0.192288   1.339   0.1806    
Ethnicity4            -0.035071   0.150877  -0.232   0.8162    
Ethnicity5            -0.225744   0.214870  -1.051   0.2934    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2304.7  on 1708  degrees of freedom
Residual deviance: 1886.4  on 1695  degrees of freedom
AIC: 1914.4

Number of Fisher Scoring iterations: 4
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity     prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.644 0.0411 Inf     0.560     0.720
 Asian Indian 0.515 0.0602 Inf     0.399     0.630
 Filipino     0.621 0.0557 Inf     0.507     0.723
 Korean       0.550 0.0469 Inf     0.457     0.639
 Other        0.503 0.0634 Inf     0.381     0.624
 Vietnamese   0.514 0.0454 Inf     0.425     0.601

Results are averaged over the levels of: Dental.Insurance, Religion 
Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T)
 contrast            odds.ratio    SE  df asymp.LCL asymp.UCL null z.ratio
 Chinese effect           1.432 0.209 Inf     0.974      2.10    1   2.458
 Asian Indian effect      0.840 0.219 Inf     0.422      1.67    1  -0.667
 Filipino effect          1.294 0.249 Inf     0.779      2.15    1   1.339
 Korean effect            0.966 0.146 Inf     0.648      1.44    1  -0.232
 Other effect             0.798 0.171 Inf     0.453      1.41    1  -1.051
 Vietnamese effect        0.834 0.125 Inf     0.561      1.24    1  -1.206
 p.value
  0.0839
  0.6057
  0.4402
  0.8162
  0.4402
  0.4402

Results are averaged over the levels of: Dental.Insurance, Religion 
Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
Intervals are back-transformed from the log odds ratio scale 
P value adjustment: fdr method for 6 tests 
Tests are performed on the log odds ratio scale 

Folk medicine

VSURF

Code
rfdata <- qol |> select(`Folkmedicine`,Ethnicity, Age, Gender,Religion,`Marital Status`, `Full Time Employment`:Retired,  Income_median, `US Born`:`Discrimination`,`Health Insurance`,`Dental Insurance`) |> 
    na.omit() |>
  as.data.frame() |> 
  rename_with(make.names)
Code
rfdata |> gtsummary::tbl_summary(include=Folkmedicine)
Characteristic N = 1,6951
Folkmedicine
    0 1,476 (87%)
    Yes 219 (13%)
1 n (%)
Code
n_minority <- rfdata |> filter(Folkmedicine=="Yes") |> nrow()

rf_options_for_vsurf <- list(
  sampsize = c(n_minority, n_minority), # Sample 'n_minority' from each class
  strata = rfdata[,1],                     # Stratify by the response variable
  importance = TRUE                     # Ensure importance is calculated
)

set.seed(123321)

VSURF(Folkmedicine~.,
      rfdata,
      na.action="na.omit",
      parallel=T,
      verbose=F,
      rf.options=rf_options_for_vsurf)->vsurf.mod
Warning in VSURF.formula(Folkmedicine ~ ., rfdata, na.action = "na.omit", : VSURF with a formula-type call outputs selected variables
which are indices of the input matrix based on the formula:
you may reorder these to get indices of the original data
Code
vsurf.mod |> summary()

 VSURF computation time: 27.6 secs 

 VSURF selected: 
    22 variables at thresholding step (in 16.6 secs)
    2 variables at interpretation step (in 10 secs)
    2 variables at prediction step (in 1 secs)

 VSURF ran in parallel on a PSOCK cluster and used 7 cores 
Code
names(rfdata[,-1])[vsurf.mod$varselect.pred]
[1] "Age"       "Ethnicity"
Code
names(rfdata[,-1])[vsurf.mod$varselect.interp]
[1] "Age"       "Ethnicity"
Code
plot(vsurf.mod)

Code
vsurf.mod$mean.perf
[1] 0.130354

Importance

Code
vi<- data.frame(Variable=names(rfdata[,-1])[vsurf.mod$imp.mean.dec.ind],
                Importance = vsurf.mod$imp.mean.dec,
                sd_Importance = vsurf.mod$imp.sd.dec
)|> 
  mutate(fill = case_when(Variable=="Ethnicity"~"red",
                                                 .default="black"))

vi |> mutate(across(Importance:sd_Importance,~round(.x,5)))
                         Variable Importance sd_Importance  fill
1                             Age    0.02416       0.00084 black
2                       Ethnicity    0.01459       0.00064   red
3           Duration.of.Residency    0.01291       0.00083 black
4                English.Speaking    0.00798       0.00075 black
5                        Religion    0.00553       0.00045 black
6                   Income_median    0.00545       0.00035 black
7                         Student    0.00460       0.00028 black
8            English.Difficulties    0.00425       0.00061 black
9                  Marital.Status    0.00382       0.00032 black
10                        Retired    0.00329       0.00047 black
11       Familiarity.with.America    0.00286       0.00040 black
12               Primary.Language    0.00277       0.00036 black
13 Familiarity.with.Ethnic.Origin    0.00238       0.00031 black
14           Full.Time.Employment    0.00184       0.00031 black
15                 Discrimination    0.00139       0.00022 black
16                      Homemaker    0.00123       0.00033 black
17            Identify.Ethnically    0.00118       0.00039 black
18                        US.Born    0.00104       0.00013 black
19               Dental.Insurance    0.00097       0.00025 black
20               Health.Insurance    0.00067       0.00016 black
21                      Belonging    0.00059       0.00036 black
22                         Gender    0.00034       0.00020 black
23           Part.Time.Employment    0.00002       0.00014 black
24        Self.Employed.Full.Time    0.00000       0.00000 black
25        Self.Employed.Part.Time    0.00000       0.00000 black
26                       Disabled    0.00000       0.00000 black
27                     Unemployed    0.00000       0.00000 black
Code
importance_plot <- ggplot(vi, aes(x = reorder(Variable, Importance), y = Importance, fill=fill))+
  geom_bar(stat = "identity",alpha=0.4) +
  geom_errorbar(aes(ymin=Importance-sd_Importance, ymax = Importance+sd_Importance))+
  
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  scale_fill_manual(values=c("black","red"),
                    guide="none")
  
plot(importance_plot)

Code
ggsave(filename = "FinalFigures/VSURF_importance_AltMedicine.png", width=6, height=4.5,units="in")

Analysis of Deviance

Code
pred_vars <- c(names(rfdata[,-1])[vsurf.mod$varselect.pred])
mod_form <- reformulate(pred_vars, response="Folkmedicine")

mod_form_noEth <- reformulate(pred_vars[!pred_vars %in%c("Ethnicity")], response="Folkmedicine")
mod1 <- glm(mod_form, family="binomial", data=rfdata)
mod2 <- glm(mod_form_noEth, family="binomial", data=rfdata)
anova(mod2,mod1)
Analysis of Deviance Table

Model 1: Folkmedicine ~ Age
Model 2: Folkmedicine ~ Age + Ethnicity
  Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
1      1693     1263.1                          
2      1688     1211.6  5   51.507 6.808e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Code
performance::check_model(mod1)
Some of the variables were in matrix-format - probably you used
  `scale()` on your data?
  If so, and you get an error, please try `datawizard::standardize()` to
  standardize your data.
Some of the variables were in matrix-format - probably you used
  `scale()` on your data?
  If so, and you get an error, please try `datawizard::standardize()` to
  standardize your data.

Code
data.frame(BIC_ethnicity = BIC(mod1),
           BIC_noethnicity=BIC(mod2))
  BIC_ethnicity BIC_noethnicity
1       1263.66         1277.99
Code
summary(mod1)

Call:
glm(formula = mod_form, family = "binomial", data = rfdata)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.11990    0.09787 -21.660  < 2e-16 ***
Age          0.46420    0.07478   6.207 5.39e-10 ***
Ethnicity1   0.53776    0.14362   3.744 0.000181 ***
Ethnicity2  -0.32424    0.18668  -1.737 0.082415 .  
Ethnicity3  -0.22481    0.24241  -0.927 0.353716    
Ethnicity4   0.81181    0.14791   5.488 4.05e-08 ***
Ethnicity5  -0.22784    0.31906  -0.714 0.475154    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1304.7  on 1694  degrees of freedom
Residual deviance: 1211.6  on 1688  degrees of freedom
AIC: 1225.6

Number of Fisher Scoring iterations: 5
Code
mod1 |>  emmeans::emmeans(~Ethnicity, type="response")
 Ethnicity      prob     SE  df asymp.LCL asymp.UCL
 Chinese      0.1646 0.0181 Inf    0.1320    0.2032
 Asian Indian 0.0768 0.0139 Inf    0.0536    0.1089
 Filipino     0.0841 0.0210 Inf    0.0511    0.1355
 Korean       0.2058 0.0228 Inf    0.1647    0.2540
 Other        0.0839 0.0286 Inf    0.0423    0.1598
 Vietnamese   0.0609 0.0129 Inf    0.0400    0.0917

Confidence level used: 0.95 
Intervals are back-transformed from the logit scale 
Code
mod1 |>  emmeans::emmeans(~Ethnicity) |> emmeans::contrast(method="eff", type="response") |> summary(infer=T)
 contrast            odds.ratio    SE  df asymp.LCL asymp.UCL null z.ratio
 Chinese effect           1.712 0.246 Inf     1.172     2.501    1   3.744
 Asian Indian effect      0.723 0.135 Inf     0.442     1.183    1  -1.737
 Filipino effect          0.799 0.194 Inf     0.421     1.514    1  -0.927
 Korean effect            2.252 0.333 Inf     1.524     3.327    1   5.488
 Other effect             0.796 0.254 Inf     0.343     1.848    1  -0.714
 Vietnamese effect        0.564 0.116 Inf     0.327     0.972    1  -2.776
 p.value
  0.0005
  0.1236
  0.4245
  <.0001
  0.4752
  0.0110

Confidence level used: 0.95 
Conf-level adjustment: bonferroni method for 6 estimates 
Intervals are back-transformed from the log odds ratio scale 
P value adjustment: fdr method for 6 tests 
Tests are performed on the log odds ratio scale